import pandas as pd
from urllib.parse import urlparse

# Load the source data.
df = pd.read_excel('Both Data.xlsx')

# Keep only the rows whose IsRootEvent and QuadClass values are in the
# allowed sets below.
is_root_event_choices = [0, 1]
quad_class_choices = [1, 2, 3, 4]  # any of 1, 2, 3, 4 may be selected
# Take an explicit copy: filtered_df is modified further down, and writing
# into a boolean-indexed selection of df triggers SettingWithCopyWarning.
filtered_df = df[
    df['IsRootEvent'].isin(is_root_event_choices)
    & df['QuadClass'].isin(quad_class_choices)
].copy()

# Helper that extracts the domain from a URL.
def extract_domain(url):
    """Return the normalized host of *url* without a leading ``www.``.

    The host is lower-cased and stripped of any port or ``user:password@``
    prefix, so the same site is always reported under the same name.

    Args:
        url: The URL to inspect. Non-string values (e.g. NaN or None) are
            accepted and yield ``None``.

    Returns:
        The host name; an empty string when the URL has no network location
        (for example a URL without a scheme); or ``None`` when *url* is not
        a string or cannot be parsed.
    """
    if not isinstance(url, str):
        return None
    try:
        # .hostname drops the port and credentials and lower-cases the host.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed URLs, e.g. unbalanced IPv6 brackets.
        return None
    if not host:
        return ''
    return host[4:] if host.startswith('www.') else host

# Extract the domain of every URL and store it in a new column of the
# original table.
df['Domain'] = df['SourceURL'].apply(extract_domain)

# Attach the domains to the filtered rows. The values are taken from
# df['Domain'] (aligned on the row index) instead of being recomputed, and
# assign() builds a new DataFrame rather than writing into a selection of df,
# which avoids SettingWithCopyWarning.
filtered_df = filtered_df.assign(Domain=df['Domain'])

# True for rows with a usable (non-null, non-empty) domain; computed once and
# reused for both outputs below.
has_domain = filtered_df['Domain'].notna() & (filtered_df['Domain'] != '')

# Rows whose URL yielded no domain.
empty_domain_urls = filtered_df[~has_domain]

# Count the occurrences of each domain, ignoring empty domains.
domain_counts = filtered_df.loc[has_domain, 'Domain'].value_counts()

# Save the per-domain counts to Excel.
domain_counts.to_excel('Domain Numbers.xlsx')

# Save the rows whose domain could not be determined to a separate Excel file.
empty_domain_urls.to_excel('Empty-domains.xlsx', index=False)

# Save the original table, now including the Domain column, to a new Excel file.
df.to_excel('Both Data with Domains.xlsx', index=False)

# Print the results for inspection.
print(df)
print(domain_counts)
print(empty_domain_urls)
